Load packages and directories
GFW effort, bathymetry (GEBCO), DTS, DTP, SST, Chla
Fit models to test presence of fishing effort and run random forest to determine variable importance
Predict presence with best fit model
Create rules to spatialise effort into EEZ where industrial effort is mostly likely to be (by gear type and length?)
bathy <- rast(file.path(rdsi_dir, "prep/random_forest/bathymetry_0.1_res.tif"))
dts <- rast(file.path(rdsi_raw_dir, "global_fishing_watch/distance_from_shore/distance-from-shore_0.1.tif"))
dtp <- rast(file.path(rdsi_raw_dir, "global_fishing_watch/distance_from_port/distance-from-port-v1_0.1.tif"))
sst <- rast(file.path(rdsi_raw_dir, "noaa/sst/resample_0.1/sst_2020_0.1.tif"))
chla <- rast(file.path(rdsi_dir, "prep/random_forest/chla_2020_0.1.tif"))
effort_subset <- read.csv(file.path(rdsi_dir, "prep/random_forest/effort_2020_vut_arg_isl.csv")) %>%
rename(x = cell_ll_lon, y = cell_ll_lat)
eez_rast <- rast(here("int/eez_raster.tif"))
eez_rast_df <- fread(here("int/eez_raster_df.csv"))
rgn_keys <- read.csv(here("raw/ohi_region_key.csv"))
Start with Argentina model
Prep all rasters to Argentina’s EEZ and have a value for each explanatory variable in each cell
eez_arg_df <- eez_rast_df %>%
filter(rgn_id == 172) %>%
dplyr::select(x, y) %>%
mutate(eez = 1)
eez_arg_rast <- rast(eez_arg_df, crs = crs(eez_rast), type = "xyz")
# depth
bathy_arg <- crop(bathy, eez_arg_rast) %>% # crop to correct region
project(., eez_arg_rast) %>% # make sure extents match
mask(., eez_arg_rast) # now mask out any areas not in EEZ
## distance to short
dts_arg <- crop(dts, eez_arg_rast) %>% # crop to correct region
project(., eez_arg_rast) %>% # make sure extents match
mask(., eez_arg_rast) # now mask out any areas not in EEZ
# distance to port
dtp_arg <- crop(dtp, eez_arg_rast) %>% # crop to correct region
project(., eez_arg_rast) %>% # make sure extents match
mask(., eez_arg_rast) # now mask out any areas not in EEZ
# sst
sst_arg <- crop(sst, eez_arg_rast) %>% # crop to correct region
project(., eez_arg_rast) %>% # make sure extents match
mask(., eez_arg_rast) # now mask out any areas not in EEZ
chl_arg <- crop(chla, eez_arg_rast) %>% # crop to correct region
project(., eez_arg_rast) %>% # make sure extents match
mask(., eez_arg_rast) %>% # now mask out any areas not in EEZ
rename(chl = mean)
# effort hours
effort_arg <- effort_subset %>%
group_by(x, y) %>%
summarise(fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
ungroup() %>%
rast(., type = "xyz", crs = crs(eez_arg_rast)) %>%
crop(., eez_arg_rast) %>%
project(., eez_arg_rast) %>%
tidyterra::mutate(fishing_hours = ifelse(is.na(fishing_hours), 0, fishing_hours)) %>%
mask(., eez_arg_rast) # ok so as.data.frame removes NAs?
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# now stack all rasters
all_rasts <- c(effort_arg, dtp_arg, dts_arg, bathy_arg, sst_arg, chl_arg) %>%
as.data.frame(., xy = TRUE, na.rm = NA) %>%
mutate(fis_pres = as.factor(ifelse(fishing_hours > 0, 1, 0))) %>%
#dplyr::select(-fishing_hours) %>%
filter(!is.na(bathymetry),
!is.na(mean),
!is.na(chl)) %>% # need to filter out NAs for RF to work
rename(dtp = 4, dts = 5, sst = mean)
all_env_stack <- c(dtp_arg, dts_arg, bathy_arg, sst_arg, chl_arg) %>%
rename(dtp = 1, dts = 2, sst = mean)
Start with random forest model on presence of fishing effort
Make models with magnitude of hours
all_rasts <- all_rasts %>%
filter(fishing_hours > 0)
# all_rasts_stack <- rast(all_rasts, type = "xyz")
## lets check the hours of effort against predictor variables
par(mfrow=c(3,2))
for(i in 4:8){
plot(all_rasts[,1]~all_rasts[,i], ylab="Effort hours", xlab=names(all_rasts)[i], main=paste0("Effort vs ",names(all_rasts)[i]))
l <- loess(all_rasts[,1]~all_rasts[,i])
j <- order(all_rasts[,i]) ## values need to be ordered for the red line to run along and not zigzag
lines(all_rasts[j,i],l$fitted[j], col="red")
}
## ok so depth is the most obvious polynomial. Potentially chl and sst. Let's just rock with depth to start
## Rounding will introduce only little error, so we'll round the values to be able to use poisson and negative binomial models, and compare each method with each other (e.g. poisson and gaussian model)
all_rasts$fishing_hours <- ceiling(all_rasts$fishing_hours)
##
effort.formula <- formula("fishing_hours ~ bathymetry+sst+chl+I(bathymetry^2)+dtp")
## we also setup the formula for the lognormal model, which uses the log-transformed response:
effort.formula.ln <- formula("log(fishing_hours) ~ bathymetry+sst+chl+I(bathymetry^2)+dtp")
## GLMs
## fitting the model and stepwise dropping variables using stepAIC, and the option "trace=FALSE" to not display each step
fit.n <- stepAIC(glm(effort.formula, data=all_rasts, family=gaussian(link="identity")), trace=FALSE)
## check the summary and diagnostics
summary(fit.n)
##
## Call:
## glm(formula = fishing_hours ~ sst + chl + I(bathymetry^2) + dtp,
## family = gaussian(link = "identity"), data = all_rasts)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -231.7 -77.8 -40.9 0.7 9392.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.500e+02 1.883e+01 -7.970 1.82e-15 ***
## sst 8.771e+00 1.340e+00 6.545 6.33e-11 ***
## chl 1.871e+01 2.605e+00 7.183 7.44e-13 ***
## I(bathymetry^2) -1.134e-05 2.608e-06 -4.348 1.39e-05 ***
## dtp 4.493e-01 3.251e-02 13.822 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 77060.82)
##
## Null deviance: 611387028 on 7651 degrees of freedom
## Residual deviance: 589284070 on 7647 degrees of freedom
## AIC: 107825
##
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit.n)
## AIC is realllllly big. This is a horrible model.
## lognormal
fit.ln <- stepAIC(glm("log(ceiling(fishing_hours)) ~ bathymetry+sst+chl+I(bathymetry^2)+dtp", data=all_rasts, family=gaussian(link="identity")), trace=FALSE)
summary(fit.ln)
##
## Call:
## glm(formula = log(ceiling(fishing_hours)) ~ bathymetry + sst +
## chl + dtp, family = gaussian(link = "identity"), data = all_rasts)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1445 -1.3387 -0.0525 1.2512 7.0307
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.098e+00 1.172e-01 -9.372 <2e-16 ***
## bathymetry 8.584e-04 5.369e-05 15.987 <2e-16 ***
## sst 1.921e-01 8.326e-03 23.069 <2e-16 ***
## chl 1.471e-01 1.642e-02 8.959 <2e-16 ***
## dtp 6.248e-03 2.078e-04 30.064 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 3.016351)
##
## Null deviance: 27626 on 7651 degrees of freedom
## Residual deviance: 23066 on 7647 degrees of freedom
## AIC: 30171
##
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit.ln)
mu <- fitted(fit.ln)
sigma <- summary(fit.ln)$df.residual/nrow(all_rasts)*sqrt(summary(fit.ln)$dispersion)
-2*sum(log(dlnorm(all_rasts$fishing_hours,mu,sigma)))+2*(length(coef(fit.ln))+1)
## [1] 69875.41
## this one is a bit better, but still not great. AIC is lower and plots look not as skewed
## poisson
fit.p <- stepAIC(glm(effort.formula, data=all_rasts, family=poisson()), trace=FALSE)
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: glm.fit: fitted rates numerically 0 occurred
## Warning: glm.fit: fitted rates numerically 0 occurred
summary(fit.p)
##
## Call:
## glm(formula = fishing_hours ~ bathymetry + sst + chl + I(bathymetry^2) +
## dtp, family = poisson(), data = all_rasts)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -24.696 -9.766 -6.361 -1.321 255.025
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.537e-01 1.156e-02 13.3 <2e-16 ***
## bathymetry -3.122e-03 2.322e-05 -134.5 <2e-16 ***
## sst 1.927e-01 7.303e-04 263.9 <2e-16 ***
## chl 1.340e-01 7.450e-04 179.9 <2e-16 ***
## I(bathymetry^2) -3.044e-06 2.261e-08 -134.6 <2e-16 ***
## dtp 5.862e-03 1.443e-05 406.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1898865 on 7651 degrees of freedom
## Residual deviance: 1596592 on 7646 degrees of freedom
## AIC: 1630853
##
## Number of Fisher Scoring iterations: 9
par(mfrow=c(2,2))
plot(fit.p)
## yikes... by far the worst
fit.rf <- randomForest(effort.formula, data=all_rasts, ntree=500)
fit.rf
##
## Call:
## randomForest(formula = effort.formula, data = all_rasts, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 52258.82
## % Var explained: 34.59
par(mfrow=c(2,1))
plot(fit.rf)
varImpPlot(fit.rf) # interesting, dtp is most important according to this. Looks like depth isn't so important anymore (this would probably change if split by gear type, species, etc)
## GAMs - add smooth spatial term
fit.gam <- gam(ceiling(fishing_hours) ~ bathymetry+sst+I(bathymetry^2)+chl+ dtp+ s(x,y), data=all_rasts, family=nb())
summary(fit.gam)
##
## Family: Negative Binomial(0.527)
## Link function: log
##
## Formula:
## ceiling(fishing_hours) ~ bathymetry + sst + I(bathymetry^2) +
## chl + dtp + s(x, y)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.170e+01 7.746e-01 15.104 < 2e-16 ***
## bathymetry -2.830e-04 1.668e-04 -1.696 0.0898 .
## sst -7.503e-01 6.339e-02 -11.835 < 2e-16 ***
## I(bathymetry^2) -1.658e-07 4.190e-08 -3.958 7.56e-05 ***
## chl 4.046e-01 2.783e-02 14.539 < 2e-16 ***
## dtp 1.016e-03 9.146e-04 1.111 0.2664
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(x,y) 28.54 28.98 2708 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0425 Deviance explained = 34.9%
## -REML = 35635 Scale est. = 1 n = 7652
par(mfrow=c(2,2))
plot(fit.gam)
## R2 not great... but not a huge deal. Not really sure how to interpret the plot...
fit.gam2 <- gam(ceiling(fishing_hours) ~ s(bathymetry)+s(sst)+s(chl)+s(x, y), data=all_rasts, family=nb())
summary(fit.gam2)
##
## Family: Negative Binomial(0.584)
## Link function: log
##
## Formula:
## ceiling(fishing_hours) ~ s(bathymetry) + s(sst) + s(chl) + s(x,
## y)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.64534 0.01523 239.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(bathymetry) 8.743 8.980 321.2 <2e-16 ***
## s(sst) 8.845 8.990 807.8 <2e-16 ***
## s(chl) 8.758 8.982 569.4 <2e-16 ***
## s(x,y) 28.680 28.986 3936.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0803 Deviance explained = 42.2%
## -REML = 35156 Scale est. = 1 n = 7652
par(mfrow=c(2,2))
plot(fit.gam2)
## seems a bit better? Based on R2 and deviance explained... however everythign is significant, so maybe not
Make predictions
#extract both mean and se of predictions
predfun <- function(model, data) {
v <- predict(model, data, type="response", se.fit=TRUE)
cbind(p=as.vector(v$fit), se=as.vector(v$se.fit))
}
## for the lognormal:
predfun.ln <- function(model, data) {
v <- predict(model, data, type="response", se.fit=TRUE)
cbind(p=as.vector(exp(v$fit)), se=as.vector(exp(v$fit+v$se.fit)-exp(v$fit)))
}
#for the gams, we need to create a raster of lons and lats to predict on
ra.lon <- raster(all_env_stack$bathymetry)
ra.lon[] <- coordinates(stack(all_env_stack)$bathymetry)[,1]
ra.lat <- raster(all_env_stack$bathymetry)
ra.lat[] <- coordinates(stack(all_env_stack)$bathymetry)[,2]
ra.lon <- rast(ra.lon)
ra.lat <- rast(ra.lat)
all_env_stack$x <- ra.lon
all_env_stack$y <- ra.lat
#generate prediction maps
pred.n <- terra::predict(all_env_stack, fit.n, fun=predfun, index=1:2)
pred.ln <- predict(all_env_stack, fit.ln, fun=predfun.ln, index=1:2)
pred.p <- predict(all_env_stack, fit.p, fun=predfun, index=1:2)
# pred.nb <- predict(all_rasts_stack, fit.nb, fun=predfun, index=1:2)
pred.gam <- predict(all_env_stack, fit.gam, fun=predfun, index=1:2)
pred.gam2 <- predict(all_env_stack, fit.gam2, fun=predfun, index=1:2)
names(pred.ln) <- names(pred.p) <- names(pred.gam) <- names(pred.gam2) <- c("mean prediction", "std error")
## brt prediction is a bit different here
all_env_stack$bathy2 <- all_env_stack$bathymetry^2
## rf prediction
pred.rf <- predict(all_env_stack, fit.rf, type="response")
#some plotting parameters:
pt.cex <- c(sqrt(10/pi)/2,sqrt(50/pi)/2,sqrt(100/pi)/2)
cex <- sqrt(all_rasts$fishing_hours/pi)/2
l <- c(10,50,100)
Now plot the predictions!
fish_eff_rast <- all_rasts %>%
dplyr::select(x, y, fishing_hours) %>%
rast(., type = "xyz")
## normal
## Plot spatial predictions and overlay observed abundance
par(mfrow=c(1,3))
## mean
plot(pred.n,1, main="normal - mean")
## standard error
plot(pred.n,2, main="normal - se")
plot(fish_eff_rast, main = "Observed hours")
## lognormal
par(mfrow=c(1,3))
## mean
plot(pred.ln,1, main="lognormal - mean")
## standard error
plot(pred.ln,2, main="lognormal - se")
plot(fish_eff_rast, main = "Observed hours")
## poisson
par(mfrow=c(1,3))
## mean
plot(pred.p,1, main="poisson - mean")
## standard error
plot(pred.p,2, main="poisson - se")
plot(fish_eff_rast, main = "Observed hours")
## gam
par(mfrow=c(1,3))
## mean
plot(pred.gam,1, main="GAM - mean")
## standard error
plot(pred.gam,2, main="GAM - se")
plot(fish_eff_rast, main = "Observed hours")
## gam with smoothing
par(mfrow=c(1,3))
## mean
plot(pred.gam2,1, main="GAM (smoothed) - mean")
## standard error
plot(pred.gam2,2, main="GAM (smoothed) - se")
plot(fish_eff_rast, main = "Observed hours")
# i think this looks the best
## random forest
par(mfrow=c(1,2))
## mean
plot(pred.rf,1, main="Random Forest - mean")
plot(fish_eff_rast, main = "Observed hours")
## ok this is the best by far
## is the RF overfit?
# Is fishing hugging the EEZ?
Do same analysis but with Iceland since they have the best coverage
eez_isl_df <- eez_rast_df %>%
filter(rgn_id == 143) %>%
dplyr::select(x, y) %>%
mutate(eez = 1)
eez_isl_rast <- rast(eez_isl_df, crs = crs(eez_rast), type = "xyz")
# depth
bathy_isl <- crop(bathy, eez_isl_rast) %>% # crop to correct region
project(., eez_isl_rast) %>% # make sure extents match
mask(., eez_isl_rast) # now mask out any areas not in EEZ
## distance to short
dts_isl <- crop(dts, eez_isl_rast) %>% # crop to correct region
project(., eez_isl_rast) %>% # make sure extents match
mask(., eez_isl_rast) # now mask out any areas not in EEZ
# distance to port
dtp_isl <- crop(dtp, eez_isl_rast) %>% # crop to correct region
project(., eez_isl_rast) %>% # make sure extents match
mask(., eez_isl_rast) # now mask out any areas not in EEZ
# sst
sst_isl <- crop(sst, eez_isl_rast) %>% # crop to correct region
project(., eez_isl_rast) %>% # make sure extents match
mask(., eez_isl_rast) # now mask out any areas not in EEZ
chl_isl <- crop(chla, eez_isl_rast) %>% # crop to correct region
project(., eez_isl_rast) %>% # make sure extents match
mask(., eez_isl_rast) %>% # now mask out any areas not in EEZ
rename(chl = mean)
# effort hours
effort_isl <- effort_subset %>%
group_by(x, y) %>%
summarise(fishing_hours = sum(fishing_hours, na.rm = TRUE)) %>%
ungroup() %>%
rast(., type = "xyz", crs = crs(eez_isl_rast)) %>%
crop(., eez_isl_rast) %>%
project(., eez_isl_rast) %>%
tidyterra::mutate(fishing_hours = ifelse(is.na(fishing_hours), 0, fishing_hours)) %>%
mask(., eez_isl_rast) # ok so as.data.frame removes NAs?
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# now stack all rasters
all_rasts <- c(effort_isl, dtp_isl, dts_isl, bathy_isl, sst_isl, chl_isl) %>%
as.data.frame(., xy = TRUE, na.rm = NA) %>%
mutate(fis_pres = as.factor(ifelse(fishing_hours > 0, 1, 0))) %>%
#dplyr::select(-fishing_hours) %>%
filter(!is.na(bathymetry),
!is.na(mean),
!is.na(chl)) %>% # need to filter out NAs for RF to work
rename(dtp = 4, dts = 5, sst = mean)
all_env_stack <- c(dtp_isl, dts_isl, bathy_isl, sst_isl, chl_isl) %>%
rename(dtp = 1, dts = 2, sst = mean)
## first lets look at correlations to see if we should remove any variables
chart.Correlation(all_rasts[,4:8])
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## dts and dtp are really highly correlated.. makes a lot of sense - will want to remove one of those. My guess is distance to port is more important for Argentina, considering they have a lot of industrial fishing
all_rasts_2 <- all_rasts %>% dplyr::select(-dts)
chart.Correlation(all_rasts_2[,4:7]) # ok, now depth and dtp are correlated.. For this simple analysis, I'll keep depth
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
chart.Correlation(all_rasts_2[,5:7]) # look ok now
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
Make models with magnitude of hours
all_rasts <- all_rasts %>%
filter(fishing_hours > 0)
## lets check the hours of effort against predictor variables
par(mfrow=c(3,2))
for(i in 4:8){
plot(all_rasts[,1]~all_rasts[,i], ylab="Effort hours", xlab=names(all_rasts)[i], main=paste0("Effort vs ",names(all_rasts)[i]))
l <- loess(all_rasts[,1]~all_rasts[,i])
j <- order(all_rasts[,i]) ## values need to be ordered for the red line to run along and not zigzag
lines(all_rasts[j,i],l$fitted[j], col="red")
}
## ok so sst is the most obvious polynomial.
## Rounding will introduce only little error, so we'll round the values to be able to use poisson and negative binomial models, and compare each method with each other (e.g. poisson and gaussian model)
all_rasts$fishing_hours <- ceiling(all_rasts$fishing_hours)
##
effort.formula <- formula("fishing_hours ~ bathymetry+sst+chl+I(sst^2)")
## we also setup the formula for the lognormal model, which uses the log-transformed response:
effort.formula.ln <- formula("log(fishing_hours) ~ bathymetry+sst+chl+I(sst^2)")
## GLMs
## fitting the model and stepwise dropping variables using stepAIC, and the option "trace=FALSE" to not display each step
fit.n <- stepAIC(glm(effort.formula, data=all_rasts, family=gaussian(link="identity")), trace=FALSE)
## check the summary and diagnostics
summary(fit.n)
##
## Call:
## glm(formula = fishing_hours ~ bathymetry + sst + chl + I(sst^2),
## family = gaussian(link = "identity"), data = all_rasts)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -653.55 -63.20 -28.59 24.62 2024.70
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -86.845122 15.648895 -5.550 2.98e-08 ***
## bathymetry 0.053140 0.003492 15.217 < 2e-16 ***
## sst 39.770699 4.794373 8.295 < 2e-16 ***
## chl 90.399407 4.561383 19.818 < 2e-16 ***
## I(sst^2) -3.155503 0.386519 -8.164 3.90e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 18508.85)
##
## Null deviance: 138571608 on 6134 degrees of freedom
## Residual deviance: 113459244 on 6130 degrees of freedom
## AIC: 77700
##
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit.n)
## AIC is realllllly big. This is a horrible model.
## lognormal
fit.ln <- stepAIC(glm("log(ceiling(fishing_hours)) ~ bathymetry+sst+chl+I(sst^2)", data=all_rasts, family=gaussian(link="identity")), trace=FALSE)
summary(fit.ln)
##
## Call:
## glm(formula = log(ceiling(fishing_hours)) ~ bathymetry + sst +
## chl + I(sst^2), family = gaussian(link = "identity"), data = all_rasts)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.4044 -0.9915 0.1312 1.0554 4.3309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.916996 0.165822 5.53 3.33e-08 ***
## bathymetry 0.001755 0.000037 47.44 < 2e-16 ***
## sst 0.767881 0.050803 15.12 < 2e-16 ***
## chl 0.912231 0.048334 18.87 < 2e-16 ***
## I(sst^2) -0.061727 0.004096 -15.07 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2.078247)
##
## Null deviance: 24102 on 6134 degrees of freedom
## Residual deviance: 12740 on 6130 degrees of freedom
## AIC: 21905
##
## Number of Fisher Scoring iterations: 2
par(mfrow=c(2,2))
plot(fit.ln)
mu <- fitted(fit.ln)
sigma <- summary(fit.ln)$df.residual/nrow(all_rasts)*sqrt(summary(fit.ln)$dispersion)
-2*sum(log(dlnorm(all_rasts$fishing_hours,mu,sigma)))+2*(length(coef(fit.ln))+1)
## [1] 57602.44
## this one is a bit better, but still not great. AIC is lower and plots look not as skewed
## poisson
fit.p <- stepAIC(glm(effort.formula, data=all_rasts, family=poisson()), trace=FALSE)
summary(fit.p)
##
## Call:
## glm(formula = fishing_hours ~ bathymetry + sst + chl + I(sst^2),
## family = poisson(), data = all_rasts)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -38.611 -7.638 -2.844 1.577 99.817
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.008e+00 2.612e-02 153.47 <2e-16 ***
## bathymetry 2.855e-03 9.296e-06 307.14 <2e-16 ***
## sst 3.539e-01 7.773e-03 45.52 <2e-16 ***
## chl 2.316e-01 2.490e-03 93.01 <2e-16 ***
## I(sst^2) -2.689e-02 5.931e-04 -45.34 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 988872 on 6134 degrees of freedom
## Residual deviance: 596793 on 6130 degrees of freedom
## AIC: 626177
##
## Number of Fisher Scoring iterations: 6
par(mfrow=c(2,2))
plot(fit.p)
## yikes... still bad
## Negative binomial
fit.nb <- stepAIC(glm.nb(effort.formula, data=all_rasts), trace=FALSE)
summary(fit.nb)
##
## Call:
## glm.nb(formula = fishing_hours ~ bathymetry + sst + chl + I(sst^2),
## data = all_rasts, init.theta = 0.7168794565, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0915 -1.1361 -0.5101 0.1812 6.2808
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.194e+00 1.433e-01 15.31 <2e-16 ***
## bathymetry 1.936e-03 3.477e-05 55.68 <2e-16 ***
## sst 6.615e-01 4.384e-02 15.09 <2e-16 ***
## chl 8.145e-01 4.007e-02 20.33 <2e-16 ***
## I(sst^2) -5.183e-02 3.546e-03 -14.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.7169) family taken to be 1)
##
## Null deviance: 12493.1 on 6134 degrees of freedom
## Residual deviance: 7054.1 on 6130 degrees of freedom
## AIC: 58406
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.7169
## Std. Err.: 0.0118
##
## 2 x log-likelihood: -58393.6060
par(mfrow=c(2,2))
plot(fit.nb)
# overfit?
fit.rf <- randomForest(effort.formula, data=all_rasts, ntree=500)
fit.rf
##
## Call:
## randomForest(formula = effort.formula, data = all_rasts, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 12503.08
## % Var explained: 44.64
par(mfrow=c(2,1))
plot(fit.rf) # depth by far most important
## GAMs - add smooth spatial term
fit.gam <- gam(ceiling(fishing_hours) ~ bathymetry+sst+I(sst^2)+chl+ s(x,y), data=all_rasts, family=nb())
summary(fit.gam)
##
## Family: Negative Binomial(0.918)
## Link function: log
##
## Formula:
## ceiling(fishing_hours) ~ bathymetry + sst + I(sst^2) + chl +
## s(x, y)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.672e+00 2.345e-01 7.129 1.01e-12 ***
## bathymetry 2.165e-03 7.193e-05 30.093 < 2e-16 ***
## sst 1.475e+00 7.047e-02 20.923 < 2e-16 ***
## I(sst^2) -1.396e-01 7.405e-03 -18.857 < 2e-16 ***
## chl -1.617e-02 5.209e-02 -0.310 0.756
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(x,y) 28.29 28.97 2513 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.228 Deviance explained = 57.2%
## -REML = 28360 Scale est. = 1 n = 6135
par(mfrow=c(2,2))
plot(fit.gam)
## R2 not great... but not a huge deal. Not really sure how to interpret the plot...
fit.gam2 <- gam(ceiling(fishing_hours) ~ s(bathymetry)+s(sst)+s(chl)+s(x, y), data=all_rasts, family=nb())
summary(fit.gam2)
##
## Family: Negative Binomial(0.981)
## Link function: log
##
## Formula:
## ceiling(fishing_hours) ~ s(bathymetry) + s(sst) + s(chl) + s(x,
## y)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.5069 0.0136 257.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(bathymetry) 8.231 8.842 977.41 <2e-16 ***
## s(sst) 7.412 8.393 457.18 <2e-16 ***
## s(chl) 7.136 8.204 76.08 <2e-16 ***
## s(x,y) 28.294 28.942 1981.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.232 Deviance explained = 60.8%
## -REML = 28108 Scale est. = 1 n = 6135
par(mfrow=c(2,2))
plot(fit.gam2)
## seems a bit better? Based on R2 and deviance explained... however everythign is significant, so maybe not
Make predictions
#extract both mean and se of predictions
predfun <- function(model, data) {
v <- predict(model, data, type="response", se.fit=TRUE)
cbind(p=as.vector(v$fit), se=as.vector(v$se.fit))
}
## for the lognormal:
predfun.ln <- function(model, data) {
v <- predict(model, data, type="response", se.fit=TRUE)
cbind(p=as.vector(exp(v$fit)), se=as.vector(exp(v$fit+v$se.fit)-exp(v$fit)))
}
#for the gams, we need to create a raster of lons and lats to predict on
ra.lon <- raster(all_env_stack$bathymetry)
ra.lon[] <- coordinates(stack(all_env_stack)$bathymetry)[,1]
ra.lat <- raster(all_env_stack$bathymetry)
ra.lat[] <- coordinates(stack(all_env_stack)$bathymetry)[,2]
ra.lon <- rast(ra.lon)
ra.lat <- rast(ra.lat)
all_env_stack$x <- ra.lon
all_env_stack$y <- ra.lat
#generate prediction maps
pred.n <- terra::predict(all_env_stack, fit.n, fun=predfun, index=1:2)
pred.ln <- predict(all_env_stack, fit.ln, fun=predfun.ln, index=1:2)
pred.p <- predict(all_env_stack, fit.p, fun=predfun, index=1:2)
pred.nb <- predict(all_env_stack, fit.nb, fun=predfun, index=1:2)
pred.gam <- predict(all_env_stack, fit.gam, fun=predfun, index=1:2)
pred.gam2 <- predict(all_env_stack, fit.gam2, fun=predfun, index=1:2)
names(pred.ln) <- names(pred.p) <- names(pred.gam) <- names(pred.gam2) <- c("mean prediction", "std error")
## brt prediction is a bit different here
all_env_stack$bathy2 <- all_env_stack$bathymetry^2
## rf prediction
pred.rf <- predict(all_env_stack, fit.rf, type="response")
#some plotting parameters:
pt.cex <- c(sqrt(10/pi)/2,sqrt(50/pi)/2,sqrt(100/pi)/2)
cex <- sqrt(all_rasts$fishing_hours/pi)/2
l <- c(10,50,100)
Now plot the predictions!
fish_eff_rast <- all_rasts %>%
dplyr::select(x, y, fishing_hours) %>%
rast(., type = "xyz")
## normal
## Plot spatial predictions and overlay observed abundance
par(mfrow=c(1,3))
## mean
plot(pred.n,1, main="normal - mean")
## standard error
plot(pred.n,2, main="normal - se")
plot(fish_eff_rast, main = "Observed hours")
## lognormal
par(mfrow=c(1,3))
## mean
plot(pred.ln,1, main="lognormal - mean")
## standard error
plot(pred.ln,2, main="lognormal - se")
plot(fish_eff_rast, main = "Observed hours")
# yikes
## poisson
par(mfrow=c(1,3))
## mean
plot(pred.p,1, main="poisson - mean")
## standard error
plot(pred.p,2, main="poisson - se")
plot(fish_eff_rast, main = "Observed hours")
## gam
par(mfrow=c(1,3))
## mean
plot(pred.gam,1, main="GAM - mean")
## standard error
plot(pred.gam,2, main="GAM - se")
plot(fish_eff_rast, main = "Observed hours")
## gam with smoothing
par(mfrow=c(1,3))
## mean
plot(pred.gam2,1, main="GAM (smoothed) - mean")
## standard error
plot(pred.gam2,2, main="GAM (smoothed) - se")
plot(fish_eff_rast, main = "Observed hours")
# i think this looks the best
## random forest
par(mfrow=c(1,2))
## mean
plot(pred.rf,1, main="Random Forest - mean predictions")
plot(fish_eff_rast, main = "Observed hours")
## ok this is the best by far but is it overfit?